Data loading and preprocessing

1. Import needed libraries

In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
In [3]:
# load data and get some insight
pdata = pd.read_csv("concrete.csv")
pdata.describe().transpose()
Out[3]:
count mean std min 25% 50% 75% max
cement 1030.0 281.167864 104.506364 102.00 192.375 272.900 350.000 540.0
slag 1030.0 73.895825 86.279342 0.00 0.000 22.000 142.950 359.4
ash 1030.0 54.188350 63.997004 0.00 0.000 0.000 118.300 200.1
water 1030.0 181.567282 21.354219 121.80 164.900 185.000 192.000 247.0
superplastic 1030.0 6.204660 5.973841 0.00 0.000 6.400 10.200 32.2
coarseagg 1030.0 972.918932 77.753954 801.00 932.000 968.000 1029.400 1145.0
fineagg 1030.0 773.580485 80.175980 594.00 730.950 779.500 824.000 992.6
age 1030.0 45.662136 63.169912 1.00 7.000 28.000 56.000 365.0
strength 1030.0 35.817961 16.705742 2.33 23.710 34.445 46.135 82.6
In [5]:
sns.boxplot(pdata['age'])
Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x12ac13ee0>
In [6]:
# the above analysis suggest that we have some outliers
pdata.age.fillna(pdata.age.median(), inplace = True)
In [11]:
sns.boxplot(pdata['slag'])
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x12d49eb80>
In [7]:
# the above analysis suggest that we have some outliers
pdata.slag.fillna(pdata.slag.median(), inplace = True)
In [70]:
sns.boxplot(pdata['superplastic'])
Out[70]:
<matplotlib.axes._subplots.AxesSubplot at 0x14237da90>
In [8]:
# the above analysis suggest that we have some outliers
pdata.superplastic.fillna(pdata.superplastic.median(), inplace = True)
In [14]:
pdata.dropna(inplace = True) 
In [15]:
print(pdata.shape)
pdata.head()
(1030, 9)
Out[15]:
cement slag ash water superplastic coarseagg fineagg age strength
0 141.3 212.0 0.0 203.5 0.0 971.8 748.5 28 29.89
1 168.9 42.2 124.3 158.3 10.8 1080.8 796.2 14 23.51
2 250.0 0.0 95.7 187.4 5.5 956.9 861.2 28 29.22
3 266.0 114.0 0.0 228.0 0.0 932.0 670.0 28 45.85
4 154.8 183.4 0.0 193.3 9.1 1047.4 696.7 28 18.29
In [16]:
# check for null value
pdata.isnull().sum()
Out[16]:
cement          0
slag            0
ash             0
water           0
superplastic    0
coarseagg       0
fineagg         0
age             0
strength        0
dtype: int64

Base on the sumarize above, we do not see we have any bad or missing data.

In [17]:
unique_values = pdata.nunique()
print('Count unique values in each column')
print(unique_values)
Count unique values in each column
cement          278
slag            185
ash             156
water           195
superplastic    111
coarseagg       284
fineagg         302
age              14
strength        845
dtype: int64
In [43]:
sns.distplot(pdata.strength, kde=False)
Out[43]:
<matplotlib.axes._subplots.AxesSubplot at 0x13bcc9850>
In [18]:
sns.distplot(pdata.slag, kde=False)
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x12d539970>
In [45]:
sns.distplot(pdata.ash, kde=False)
Out[45]:
<matplotlib.axes._subplots.AxesSubplot at 0x13ba73190>
In [19]:
sns.distplot(pdata.superplastic, kde=False)
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x12d56ed60>
In [20]:
pdata.superplastic.describe()
Out[20]:
count    1030.000000
mean        6.204660
std         5.973841
min         0.000000
25%         0.000000
50%         6.400000
75%        10.200000
max        32.200000
Name: superplastic, dtype: float64
In [21]:
sns.distplot(pdata.coarseagg, kde=False)
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x12d642af0>
In [48]:
sns.distplot(pdata.fineagg, kde=False)
Out[48]:
<matplotlib.axes._subplots.AxesSubplot at 0x14092b040>
In [22]:
sns.distplot(pdata.age, kde=False)
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x12d743e20>

We see the data is not too smooth and some outliers

In [23]:
sns.distplot(pdata.cement, kde=False)
Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x12d83aac0>
In [24]:
sns.distplot(pdata.water, kde=False)
Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x12d9065b0>

We see some ouliers here

In [25]:
pdata.hist(bins=20, figsize=(16,12))
plt.show()
In [74]:
corr = pdata.corr() 
sns.heatmap(corr, annot=True)
Out[74]:
<matplotlib.axes._subplots.AxesSubplot at 0x142debeb0>

Base on above plot, we can see cement, superplastic and age are those major factors impact strength

In [26]:
sns.pairplot(pdata)
Out[26]:
<seaborn.axisgrid.PairGrid at 0x12dcaf7c0>

Base on above plot, we can see strength increase when cement, age, and superplastic increase

Feature Engineering techniques

Data Preprocessing

In [9]:
X = pdata.drop('strength', axis=1)
Y = pdata['strength']
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
X_train.head()
Out[9]:
cement slag ash water superplastic coarseagg fineagg age
196 231.8 0.0 121.6 174.0 6.7 1056.4 778.5 14
631 446.0 24.0 79.0 162.0 11.6 967.0 712.0 3
81 116.0 173.0 0.0 192.0 0.0 909.8 891.9 90
526 500.0 0.0 0.0 200.0 0.0 1125.0 613.0 3
830 425.0 106.3 0.0 153.5 16.5 852.1 887.1 7
In [10]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
In [11]:
def label_feature(bars):
    for bar in bars:
        height = bar.get_height()
        ax.annotate('{:.2f}'.format(height), xy=(bar.get_x() + bar.get_width() / 2, height), ha='center', va='bottom')
In [15]:
features = ["cement", "slag", "ash", "water", "superplastic", "coarseagg", "fineagg", "age"]
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, Y_train)
Y_pred_lr = lr.predict(X_test)
score1 = lr.score(X_test, Y_test)
score1
x = np.arange(len(features))
width = 0.3
fig, ax = plt.subplots(figsize=(8,6))

bars = ax.bar(x, lr.coef_, width)
ax.set_ylabel('importances')
ax.set_xlabel('features')
ax.set_xticks(x)
ax.set_xticklabels(features, rotation=90)

label_feature(bars)
fig.tight_layout()
plt.show()
In [20]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, Y_train)
Y_pred_rf = rf.predict(X_test)
score2 = rf.score(X_test, Y_test)
print(rf.get_params())
score2
{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
Out[20]:
0.9017618541823964
In [21]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()

dt.fit(X_train, Y_train)
Y_pred_dt = dt.predict(X_test)
score3 = dt.score(X_test, Y_test)
print(dt.get_params())
score3
{'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': 'deprecated', 'random_state': None, 'splitter': 'best'}
Out[21]:
0.8819872244342749
In [16]:
fig, ax = plt.subplots(figsize=(10,6))
dt_bars = ax.bar(x-(width/2), dt.feature_importances_, width, label='Decision Tree')
rf_bars = ax.bar(x+(width/2), rf.feature_importances_, width, label='Random Forest')

ax.set_ylabel('importance')
ax.set_xlabel('features')
ax.set_xticks(x)
ax.set_xticklabels(features, rotation=90)
ax.legend(loc="upper left", bbox_to_anchor=(1,1))

label_feature(dt_bars)
label_feature(rf_bars)
plt.show()

Hyperparameter tuning

In [22]:
from sklearn.model_selection import GridSearchCV
rf_param_grid = {'bootstrap': [True],
    'max_depth': [8, 9, 10, 11],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [10, 20, 30, 100]
    }

gscv_rf = GridSearchCV(rf, rf_param_grid, cv=10)
gscv_rf.fit(X_train,Y_train)
print("r square for Random Forest: {}".format(gscv_rf.best_score_))
print("best hyperparameters for Random Forest: {}".format(gscv_rf.best_params_))
r square for Random Forest: 0.8639654236991827
best hyperparameters for Random Forest: {'bootstrap': True, 'max_depth': 10, 'max_features': 3, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 30}
In [34]:
dt_param_grid = {"criterion": ["mse", "mae"],
              "min_samples_split": [2, 4, 8],
              "max_depth": [2, 6, 8],
              "min_samples_leaf": [2, 4, 10],
              "max_leaf_nodes": [10, 40, 400],
              }
gscv_dt = GridSearchCV(dt, dt_param_grid, cv=10)
gscv_dt.fit(X_train,Y_train)
print("r square for Decision Trees: {}".format(gscv_dt.best_score_))
print("best hyperparameters for Decision Trees: {}".format(gscv_dt.best_params_))
r square for Decision Trees: 0.7960733077557982
best hyperparameters for Decision Trees: {'criterion': 'mae', 'max_depth': 8, 'max_leaf_nodes': 400, 'min_samples_leaf': 2, 'min_samples_split': 2}
In [36]:
def evaluate(model, x, y):
    predictions = model.predict(x)
    errors = abs(predictions - y)
    mape = 100 * np.mean(errors / y)
    accuracy = 100 - mape
    print('Average Error = {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy
In [41]:
# Random Forest evaluate
rf_accuracy = evaluate(rf, X_test, Y_test)
Average Error: 3.4370 degrees.
Accuracy = 88.35%.
In [40]:
# Random Forest after tuning
rf_tunning_best = gscv_rf.best_estimator_
rf_tunning_accuracy = evaluate(rf_tunning_best, X_test, Y_test)
Average Error: 4.3817 degrees.
Accuracy = 84.42%.
In [38]:
# Decision Tree evaluate
dt_accuracy = evaluate(dt, X_test, Y_test)
Average Error: 3.7638 degrees.
Accuracy = 85.98%.
In [37]:
# Decision Tree after tuning
dt_tunning_best = gscv_dt.best_estimator_
dt_tunning_accuracy = evaluate(dt_tunning_best, X_test, Y_test)
Average Error: 4.6460 degrees.
Accuracy = 84.42%.
In [ ]: